#coding=utf-8
#聚类分析
import numpy
import random
import pandas as pd
import matplotlib.pyplot as plt
from time import time
import random
# Reference timestamp taken when the script starts; count_time() reports
# elapsed time relative to it.
start = time()


def count_time():
    """Print the number of seconds elapsed since ``start``, rounded to 2 places."""
    elapsed_seconds = time() - start
    rounded = round(elapsed_seconds, 2)
    print("程序运行了" + str(rounded) + "秒")

# Load the per-student statistics and keep only the 100 students with the
# largest 'num' value (number of problems attempted) for the analysis.
lom = pd.read_csv("junyi_data/problemlog/student_correct.csv").sort_values(
    by='num', ascending=False
)[:100]
print(lom)
# Renumber the rows 0..n-1 so the loop below can look rows up by label.
lom = lom.reset_index(drop=True)
# Every data point is a [num, correct_rate] pair.
data = []
for i in range(len(lom)):
    moa = [lom['num'][i], lom['correct_rate'][i]]
    data.append(moa)
count_time()

# Global matplotlib settings applied before any figure is drawn:
# the sans-serif font (presumably chosen so the Chinese labels render),
# the minus-sign rendering flag, and one shared background colour for the
# axes, the figure and the saved image.
plt.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
    'axes.facecolor': '#F5F9FC',
    "figure.facecolor": '#F5F9FC',
    "savefig.facecolor": '#F5F9FC',
})
# Randomly pick k initial centroids.
def findCentroids(data_get, k):
    """Return ``k`` elements of ``data_get`` sampled without replacement.

    Args:
        data_get: Sequence of data points to choose from.
        k: Number of centroids to draw.

    Returns:
        A new list holding the ``k`` chosen elements.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``k`` is larger
            than ``len(data_get)`` or negative.
    """
    initial_centroids = random.sample(data_get, k)
    return initial_centroids
# Euclidean distance between vector vecA and vector vecB.
def calculateDistance(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    Args:
        vecA: First vector; a numpy array or any array-like (list, tuple).
        vecB: Second vector, with a shape compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a numpy scalar.
    """
    # asarray passes existing ndarrays through and converts plain sequences,
    # so list/tuple arguments no longer fail on the subtraction.
    difference = numpy.asarray(vecA) - numpy.asarray(vecB)
    return numpy.sqrt(numpy.sum(numpy.square(difference)))
# Assign every element of data_get to the centroid in centroidList that is
# closest to it by Euclidean distance.
def minDistance(data_get, centroidList):
    """Group the elements of ``data_get`` by their nearest centroid.

    Args:
        data_get: Iterable of data points (each convertible by
            ``numpy.array``).
        centroidList: Sequence of centroids; a centroid's position in this
            sequence is used as its cluster label.

    Returns:
        A dict mapping cluster label to the list of elements assigned to it.
        Labels that received no element are absent. On a distance tie the
        lowest label wins, and label 0 is used when no centroid distance is
        below infinity (for example when ``centroidList`` is empty).
    """
    clusterDict = {}  # cluster label -> elements assigned to that cluster
    for element in data_get:
        point = numpy.array(element)
        nearest_label = 0
        nearest_distance = float("inf")
        for label, centre in enumerate(centroidList):
            distance = calculateDistance(point, numpy.array(centre))
            # Strict comparison keeps the first (lowest-label) centroid on ties.
            if distance < nearest_distance:
                nearest_label, nearest_distance = label, distance
        # Create the cluster's list on first use, then add the element.
        clusterDict.setdefault(nearest_label, []).append(element)
    return clusterDict

# Recompute the centroid of every cluster.
def getCentroids(clusterDict):
    """Return the mean point of each cluster as a list of centroids.

    Args:
        clusterDict: Mapping of cluster label to the list of points assigned
            to that cluster.

    Returns:
        A list with one centroid per cluster, ordered by ascending cluster
        label. When the labels are exactly 0..k-1 the centroid of cluster
        ``i`` sits at index ``i``; if a label is missing (a cluster that
        received no points) the positions after it shift down by one.
    """
    centroidList = list()
    # Visit labels in ascending order so list positions line up with the
    # labels, which the rest of the file relies on via centroidList[key].
    for key in sorted(clusterDict):
        # axis=0 is an argument of numpy.mean (column-wise mean over the
        # cluster's points); numpy.array does not accept it.
        centroid = numpy.mean(numpy.array(clusterDict[key]), axis=0)
        centroidList.append(centroid)
    return numpy.array(centroidList).tolist()

# Clustering error measure: for every cluster, add up the distances between
# its members and its centroid, then total those sums over all clusters.
def calculate_Var(clusterDict, centroidList):
    """Return the total member-to-centroid distance over all clusters.

    Note that plain Euclidean distances are summed (not squared distances,
    and no averaging is done).

    Args:
        clusterDict: Mapping of cluster label to the list of points in that
            cluster.
        centroidList: Centroids indexable by cluster label.

    Returns:
        The accumulated distance; ``0.0`` when ``clusterDict`` is empty.
    """
    total = 0.0
    for label, members in clusterDict.items():
        centre = numpy.array(centroidList[label])
        within_cluster = 0.0
        for member in members:
            within_cluster += calculateDistance(centre, numpy.array(member))
        total += within_cluster
    return total

# Plot the clustering result.
def showCluster(centroidList, clusterDict):
    """Draw every cluster with its centroid, save the figure, then show it.

    Args:
        centroidList: Centroids indexable by cluster label; the first two
            components of each centroid are used as x and y.
        clusterDict: Mapping of integer cluster label to the list of points
            in that cluster; the first two components of each point are used
            as x and y.

    Side effects:
        Writes the figure to a fixed path under ``static/assets/img/`` and
        opens the matplotlib window via ``plt.show()``.
    """
    colorMark = ['or', '+b', 'xg', 'oy', 'ow']  # marker style for cluster members
    centroidMark = ['dr', 'db', 'dg', 'dk', 'dy']  # marker style for centroids
    for key in clusterDict.keys():
        # Wrap around the style tables so that more than five clusters reuse
        # styles instead of raising IndexError.
        style = key % len(colorMark)
        # Draw the centroid of this cluster.
        plt.plot(centroidList[key][0], centroidList[key][1], centroidMark[style], markersize=2)
        # Draw the points that belong to this cluster.
        for item in clusterDict[key]:
            plt.plot(item[0], item[1], colorMark[style])
    # Title and axis labels apply to the whole figure, so set them once
    # instead of on every loop iteration.
    plt.title("聚类分析学生做题正确率和做题数量的关系")
    plt.xlabel('做题数量')
    plt.ylabel('正确率')
    plt.savefig("static/assets/img/聚类分析学生做题正确率和做题数量的关系.jpg")
    plt.show()

# Driver: run one clustering pass over `data` and plot the result,
# printing the elapsed time after each stage.
# NOTE(review): in the code visible here, newVar and oldVar are assigned but
# never read, and getCentroids is never called, so no iterative refinement
# happens -- the plot shows the assignment to the initial random centroids.
print(data)
centroidList = findCentroids(data,3)# pick three random centroids
count_time()
clusterDict = minDistance(data, centroidList)# first clustering pass
count_time()
newVar = calculate_Var(clusterDict, centroidList)# error measure; intended for comparing old and new values as the stop condition
count_time()
oldVar = -0.0001# initial value for the previous error measure
showCluster(centroidList, clusterDict)# display the clustering result
count_time()
print('over')

